Imports

library(evd); 
library(evdbayes);
library(coda);
library(ismev);
Lade nötiges Paket: mgcv
Lade nötiges Paket: nlme
This is mgcv 1.8-24. For overview type 'help("mgcv-package")'.
library(xts);
Lade nötiges Paket: zoo

Attache Paket: ‘zoo’

The following objects are masked from ‘package:base’:

    as.Date, as.Date.numeric
library(ggplot2);

Loading the Data

load("../data/CAPE_Minder_Rychener_Malsot.RData")
load("../data/NINO34.RData")
load("../data/SRH_Minder_Rychener_Malsot.RData")

Generate PROD

prod = sqrt(cape)*srh

** Create Time Series Objects **

dates <- seq.Date(as.Date("1979-1-1"), as.Date("2015-12-31"), by=1)
feb29ix <- format(as.Date(dates), "%m-%d") == "02-29"
dates <- dates[!feb29ix]
prod_ts = xts(prod, order.by = rep(dates, each=8))

Plot of Time Series

ts_plot = autoplot(prod_ts) + xlab("Time") + ylab("PROD")
ggsave("../plots/full_time_series.pdf", plot=ts_plot, width = 6, height = 3)
ggsave("../plots/full_time_series.png", plot=ts_plot, width = 3, height = 1.5)
prod_full_monthly_max = apply.monthly(prod_ts, max)
max_ts_plot = autoplot(prod_full_monthly_max) +
  xlab("Time") + ylab("PROD") +
  geom_line(size=.5)
ggsave("../plots/monthly_max_series.pdf", plot=max_ts_plot, width = 6, height = 3)
ggsave("../plots/monthly_max_series.png", plot=max_ts_plot, width = 3, height = 1.5)
nino34_plot = ggplot() + geom_line(aes(index(prod_full_monthly_max), nino34)) +
  xlab("Time") + ylab("NINO 3.4")
ggsave("../plots/nino34_plot.pdf", plot=nino34_plot, width = 6, height = 3)
ggsave("../plots/nino34_plot.png", plot=nino34_plot, width = 3, height = 1.5)

Group Monthly

month_names = c("jan","feb","mar","apr","may","jun","jul","aug","sep","oct","nov","dec")
get_monthly = function(x) {
  output = list()
  len = nrow(x)
  
  # Get month of element
  month = time(x)
  month = gsub("....-", "", month)
  month = gsub("-..", "", month)
  monthlist = unique(month)
  for (i in 1:12) {
    output[[i]] = x[month == monthlist[i],]
  }
  names(output) = month_names
  return(output)
}
monthly_max = get_monthly(apply.monthly(prod_ts, max))
r = 2
r_monthly_max = get_monthly(apply.monthly(prod_ts, function(x) order(x, decreasing=T)[1:r]))

Plot Monthly Time Series

for (i in 1:12) {
  tmp = autoplot(monthly_max[[i]]) + xlab("Time") + ylab("PROD")
  ggsave(paste("../plots/monthly_max_series/", sprintf("%02d", i), "monthly_max_series.pdf", sep=""), 
         plot=tmp, width = 6, height = 3)
}
# get the monthly maxima of prod
m1 = as.data.frame(apply.monthly(prod_ts, max))$V1;
# produce the gumbel qq plot
gumbel_qq = function(x, title="") qqplot(qgumbel(c(1:length(x))/(length(x)+1)),
                                         x,
                                         main = paste("Gumbell Q-Q Plot", title),
                                         xlab = "Theoretical Quantiles", 
                                         ylab = "Sample Quantiles") ;
gumbel_qq(m1)

#qqplot(qgumbel(c(1:length(m1))/(length(m1)+1)),m1,main = "Gumbell Q-Q Plot",xlab = "Theoretical Quantiles", ylab = "Sample Quantiles") ;

The qq plot doesn’t fit very well, especially in the lower tail. This is likely due to seasonal dependence.

Fitting GEV to the entire data

# fit gevd with MLE and produce diagnostic plots
fitmax.MLE<-fgev(m1,method="Nelder-Mead")
par(mfrow=c(2,2))
fitmax.MLE

Call: fgev(x = m1, method = "Nelder-Mead") 
Deviance: 9272.599 

Estimates
      loc      scale      shape  
7.519e+03  7.013e+03  4.757e-03  

Standard Errors
      loc      scale      shape  
421.90201  331.43749    0.05347  

Optimization Information
  Convergence: successful 
  Function Evaluations: 171 
plot(fitmax.MLE)

Poor fit, probably because the distribution isn’t stationary. This is especially visible in the Probability plot, in which the confidence band is surpassed, indicating a poor fit.

# fit gevd with Bayesian Techniques
# use the previous outputs (rounded) as initial values (use a different shape)
init<-c(1.6e4,4e3,0.1)
# arbitrary priors
mat <- diag(c(10000,10000,100)) 
pn <- prior.norm(mean=c(0,0,0),cov=mat)
# proposal standard deviation (found by trial and error to get 20%<acceptance rate<40%)
psd<-c(500,0.03,0.02)
# draw 3k samples from markov chain
nit = 30000
thinning = 300
post<-posterior(nit, init=init,prior=pn,lh="gev",data=m1,psd=psd)
# diagnostic plots
MCMC<-mcmc(post[1:nit %% thinning == 0, ], thin=300) 
plot(MCMC) 

attr(mcmc(post),"ar")
            mu sigma   xi total
acc.rates 0.24  0.71 0.70  0.55
ext.rates 0.00  0.00 0.01  0.00
#MCMC_stab <- mcmc(post, thin=50, start=1000)
acf(mcmc(post[1:nit %% thinning == 0, ]))

We observe that there seem to be no substantial issues from the autocorrelation plots.

apply(mcmc(post[1:nit %% thinning == 0, ]),2,mean)
           mu         sigma            xi 
  236.0524535 11181.3806761    -0.1897774 
apply(mcmc(post[1:nit %% thinning == 0, ]),2,sd)
          mu        sigma           xi 
 98.60894002 624.46021143   0.03409059 

** Fit with MLE for months separately**

#monthly_fits = lapply(monthly_max, 
#                      function(x) fgev(data.frame(x)[,1], method="Nelder-Mead"))
monthly_fits = list()
error_cases = c(9, 12)
for (i in 1:length(monthly_max)) {
  if (i %in% error_cases) {
    monthly_fits[[i]] = fgev(as.data.frame(monthly_max[[i]])$V1, 
                             method = "Nelder-Mead",
                             std.err = FALSE)
  }
  else {
    monthly_fits[[i]] = fgev(as.data.frame(monthly_max[[i]])$V1, 
                             method = "Nelder-Mead")
  }
}
optimization may not have succeeded
names(monthly_fits) = names(monthly_max)
for (i in 1:12) {
  pdf(paste("../plots/monthly_mle_diag/", sprintf("%02d", i), "_monthly_mle_diag.pdf", sep=""),
      width=7, height=7)
  par(mfrow=c(2,2))
  plot(monthly_fits[[i]])
  dev.off()
}
gumbel_qq(data.frame(monthly_max[[9]])[,1], "September")

gumbel_qq(data.frame(monthly_max[[12]])[,1], "December")

Fit R largest order statistics

largest_10 = get_monthly(apply.monthly(prod_ts, function(x) order(x, decreasing=T)[1:4]))
largest_2_fit = lapply(largest_10, function(x) rlarg.fit(x[ , 1:2]))
$conv
[1] 0

$nllh
[1] 381.7301

$mle
[1] 172.4434267  63.1813324  -0.8246329

$se
[1] 10.3518367  6.2427031  0.1133745

$conv
[1] 0

$nllh
[1] 366.2861

$mle
[1] 155.5451810  51.4464515  -0.7267874

$se
[1] 8.3567636 4.6071290 0.1019313

$conv
[1] 0

$nllh
[1] 348.4217

$mle
[1] 195.3919929  40.8441452  -0.8324715

$se
[1] 6.33288212 4.00545299 0.08820541

$conv
[1] 0

$nllh
[1] 375.033

$mle
[1] 162.9292046  57.8831308  -0.7755148

$se
[1] 9.282767 5.432561 0.099956

$conv
[1] 0

$nllh
[1] 380.4256

$mle
[1] 157.2651459  61.7442705  -0.6554637

$se
[1] 11.5888901  5.2748342  0.1508781

$conv
[1] 0

$nllh
[1] 364.4628

$mle
[1] 104.8006071  51.1390400  -0.1253138

$se
[1] 9.153534 4.541184 0.147210

$conv
[1] 0

$nllh
[1] 386.7889

$mle
[1] 128.6412577  66.3424033  -0.4883028

$se
[1] 11.877915  5.260236  0.139483

$conv
[1] 0

$nllh
[1] 380.7543

$mle
[1] 130.1181620  62.4551520  -0.3960438

$se
[1] 10.9040496  4.7896609  0.1237002

$conv
[1] 0

$nllh
[1] 361.8412

$mle
[1] 111.0849233  49.8496920  -0.2990619

$se
[1] 8.2812262 3.8324125 0.1013576

$conv
[1] 0

$nllh
[1] 377.358

$mle
[1] 75.1622227 56.9409902  0.2522703

$se
[1] 10.0789585  7.2986557  0.1984848

$conv
[1] 0

$nllh
[1] 376.1717

$mle
[1] 106.7510273  60.2707332  -0.1192325

$se
[1] 10.4205289  5.3759827  0.1333882

$conv
[1] 0

$nllh
[1] 377.2483

$mle
[1] 126.4665127  59.0621082  -0.2939312

$se
[1] 9.9836743 4.7019303 0.1170011
get_se = function(x, ix) {
  if (is.null(x$std.err)) 0
  else x$std.err[ix]
}
mle_loc = unlist(lapply(monthly_fits, function(x) x$estimate[1]))
mle_loc_se = unlist(lapply(monthly_fits, get_se, 1))
mle_scale = unlist(lapply(monthly_fits, function(x) x$estimate[2]))
mle_scale_se = unlist(lapply(monthly_fits, get_se, 2))
mle_shape = unlist(lapply(monthly_fits, function(x) x$estimate[3]))
mle_shape_se = unlist(lapply(monthly_fits, get_se, 3))
plot_w_err = function(x, y, se, se.conf_mult = 1,title = NULL) {
  upper_error = y+se.conf_mult*se
  lower_error = y-se.conf_mult*se
  
  max_y = max(upper_error)
  min_y = min(lower_error)
  
  plot(x, y,
       ylim = c(min_y, max_y),
       main = title)
  arrows(x,upper_error,x,lower_error, code=3, length=0.02, angle = 90)
}
plot_w_err(1:12, mle_loc, mle_loc_se, 1, "Location Parameter as Estimated with Likelihood")

plot_w_err(1:12, mle_scale, mle_scale_se, 1, "Scale Parameter as Estimated with Likelihood")

plot_w_err(1:12, mle_shape, mle_shape_se, 1, "Shape Parameter as Estimated with Likelihood")

** Fit with Bayesian Methods for months separately**



# Fits GEV distribution with bayesian method for given parameters
bayes_fitter = function(x, 
                        init = c(1e3, 1e3, 0.1), # Initial values
                        mat = diag(c(10000,10000,100)),
                        psd = c(500,0.1,0.1), # Proposed SDev
                        nit = 3000, # Nb Iterations
                        thinning = 50, # Thinning Factor
                        do_diagn = FALSE, # Bool whether to show diagnostic plots
                        do_autoreg = FALSE, # Bool whether to show autoreg plots
                        seed = 42 # Seed 
                        ) {
  set.seed(seed)                
  pn = prior.norm(mean=c(0,0,0),cov=mat)
  post = posterior(nit, init=init, prior=pn, lh="gev", data=x, psd=psd)
  
  if(do_diagn) {
    MCMC<-mcmc(post[1:nit %% thinning == 0, ]) 
    plot(MCMC) 
  }
  if(do_autoreg) {
    acf(mcmc(post[1:nit %% thinning == 0, ]))
  }
  list(posterior = post, 
       acceptance_rate = attr(mcmc(post),"ar"))
}



# Iteratively fits GEV with bayesian methods, until the fit has 
# acceptable acceptance rates (i.e. 0.2 < AR < 0.4). If the AR is too high, 
# the proposed SDev for the parameter is multiplied with 1.5. If it's too small,
# the proposed SDev is divided by 2. This is repeated until the acceptance rate
# is good for all parameters, or max_it is reached. Then, a final model is fitted
# with more iterations. 
bayes_fitter_param_search = function(x,
                                     psd_init = c(500,0.1,0.1), # Initial proposed SDev
                                     nit_full = 3000, # Nb Iterations for final model
                                     nit_search = 150, # Nb Iterations for param search
                                     do_diagn = FALSE, # Bool whether to show diagnostic plots
                                     do_autoreg = FALSE, # Bool whether to show autoreg plots
                                     max_it = 20,
                                     ... # Additional params passed to bayes_fitter
                                     ) 
{
  # Iterate until desired acceptance rate is obtained
  cont = TRUE
  psd = psd_init
  it = 0
  while(cont) {
    it = it+1
    if (it > max_it) {
      warning("The ")
    }
    fit = bayes_fitter(x, psd=psd, nit=nit_search, do_diagn=FALSE, 
                       do_autoreg=FALSE,...)
    acc_rates = fit$acceptance_rate[1, 1:3]
    
    too_high = acc_rates > .4
    too_low = acc_rates < .2
    
    if (all(!too_high) && all(!too_low)) { # All acceptance rates lie within threshold
      cont=FALSE
    } else if (it > max_it) { # max_it is reached
      cont=FALSE
      warning("max_it was reached")
    } else { # Correct values which have wrong threshold
      psd[too_high] = psd[too_high] * 1.5
      psd[too_low] = psd[too_low] / 2
    }
  }
  
  # Fit final model
  bayes_fitter(x, psd=psd, nit=nit_full, do_diagn=do_diagn, 
               do_autoreg=do_autoreg, ...)
  
}
                                     

monthly_bayes_fit = lapply(monthly_max, bayes_fitter_param_search, do_diagn = TRUE, 
                           do_autoreg = TRUE,
                           psd = c(500,0.3,0.3), nit_full=30000, nit_search = 30000,
                           thinning = 300)
acceptance_rates = lapply(monthly_bayes_fit, function(x) x$acceptance_rate[1, ])
print(acceptance_rates)
bayes_params = lapply(monthly_bayes_fit, function(x) apply(x$posterior, 2, median))
bayes_stderr = lapply(monthly_bayes_fit, function(x) apply(x$posterior, 2, sd))

TODO-> R largest fit

PART 2 We perform 2 tests, first for linear depenance, then for step dependance (trend = int(rescaled(trend)>0))

First, we check if the location parameter depends on time using a likelihood ratio test

#monthly_fits = lapply(monthly_max, 
#                      function(x) fgev(data.frame(x)[,1], method="Nelder-Mead"))
ratios = list()
trend = 1:length(as.data.frame(monthly_max[[12]])$V1)
trend = (trend-mean(trend))/sd(trend) # scale and center covariates as recommended
error_cases = c(9, 12)
for (i in 1:length(monthly_max)) {
  print(i)
  
  fit_const = fgev(as.data.frame(monthly_max[[i]])$V1, 
                             method = "Nelder-Mead",
                             std.err = FALSE)
  fit_dependant = fgev(as.data.frame(monthly_max[[i]])$V1, 
                             method = "Nelder-Mead",
                             nsloc = trend,
                             std.err = FALSE)
  
  ratios[[i]] = fit_const$dev-fit_dependant$dev 
}
[1] 1
optimization may not have succeeded
[1] 2
optimization may not have succeeded
[1] 3
[1] 4
optimization may not have succeeded
[1] 5
optimization may not have succeeded
[1] 6
[1] 7
[1] 8
[1] 9
optimization may not have succeeded
[1] 10
[1] 11
optimization may not have succeeded
[1] 12
names(ratios) = names(monthly_max)
chi_95level = qchisq(1-0.05/12,1)
plot(unlist(ratios),main="95% confidence test for time independance, \nBonferroni multiple Testing", xlab="Month",ylab="Likelihood Ratio Statistic")
abline(a=chi_95level,b=0,col="red")

Now, let’s check for independance from ENSO

#monthly_fits = lapply(monthly_max, 
#                      function(x) fgev(data.frame(x)[,1], method="Nelder-Mead"))
ratios = list()
# split nino data into months
n = nino34
dim(n)=c(12,length(as.data.frame(monthly_max[[12]])$V1))
error_cases = c(9, 12)
for (i in 1:length(monthly_max)) {
  print(i)
  trend = n[i,]
  
  trend = (trend-mean(trend))/sd(trend) # scale and center covariates as recommended
  fit_const = fgev(as.data.frame(monthly_max[[i]])$V1, 
                             method = "Nelder-Mead",
                             std.err = FALSE)
  fit_dependant = fgev(as.data.frame(monthly_max[[i]])$V1, 
                             method = "Nelder-Mead",
                             nsloc = trend,
                             std.err = FALSE)
  
  ratios[[i]] = fit_const$dev-fit_dependant$dev 
}
[1] 1
optimization may not have succeeded
[1] 2
optimization may not have succeeded
[1] 3
[1] 4
optimization may not have succeeded
[1] 5
[1] 6
optimization may not have succeeded
[1] 7
[1] 8
optimization may not have succeeded
[1] 9
optimization may not have succeeded
[1] 10
[1] 11
[1] 12
names(ratios) = names(monthly_max)
chi_95level = qchisq(1-0.05/12,1)
plot(unlist(ratios),main="95% confidence test for independance from ENSO, \nBonferroni multiple testing", xlab="Month",ylab="Likelyhood Ratio Statistic")
abline(a=chi_95level,b=0,col="red")

Now the same for step dependance

#monthly_fits = lapply(monthly_max, 
#                      function(x) fgev(data.frame(x)[,1], method="Nelder-Mead"))
ratios = list()
# split nino data into months
n = nino34
dim(n)=c(12,length(as.data.frame(monthly_max[[12]])$V1))
error_cases = c(9, 12)
for (i in 1:length(monthly_max)) {
  print(i)
  trend = n[i,]
  trend = as.integer(trend>0)
  
  trend = (trend-mean(trend))/sd(trend) # scale and center covariates as recommended
  fit_const = fgev(as.data.frame(monthly_max[[i]])$V1, 
                             method = "Nelder-Mead",
                             std.err = FALSE)
  fit_dependant = fgev(as.data.frame(monthly_max[[i]])$V1, 
                             method = "Nelder-Mead",
                             nsloc = trend,
                             std.err = FALSE)
  
  ratios[[i]] = fit_const$dev-fit_dependant$dev 
}
[1] 1
[1] 2
[1] 3
optimization may not have succeeded
[1] 4
[1] 5
[1] 6
[1] 7
[1] 8
[1] 9
optimization may not have succeeded
[1] 10
[1] 11
optimization may not have succeeded
[1] 12
names(ratios) = names(monthly_max)
chi_95level = qchisq(1-0.05/12,1)
plot(unlist(ratios),main="95% confidence test for independance from ENSO, \nBonferroni multiple testing", xlab="Month",ylab="Likelyhood Ratio Statistic")
abline(a=chi_95level,b=0,col="red")

#monthly_fits = lapply(monthly_max, 
#                      function(x) fgev(data.frame(x)[,1], method="Nelder-Mead"))
ratios = list()
trend = 1:length(as.data.frame(monthly_max[[12]])$V1)
trend = (trend-mean(trend))/sd(trend) # scale and center covariates as recommended
trend = as.integer(trend>0)
error_cases = c(9, 12)
for (i in 1:length(monthly_max)) {
  print(i)
  
  fit_const = fgev(as.data.frame(monthly_max[[i]])$V1, 
                             method = "Nelder-Mead",
                             std.err = FALSE)
  fit_dependant = fgev(as.data.frame(monthly_max[[i]])$V1, 
                             method = "Nelder-Mead",
                             nsloc = trend,
                             std.err = FALSE)
  
  ratios[[i]] = fit_const$dev-fit_dependant$dev 
}
[1] 1
[1] 2
optimization may not have succeeded
[1] 3
[1] 4
[1] 5
[1] 6
[1] 7
[1] 8
[1] 9
optimization may not have succeeded
[1] 10
[1] 11
optimization may not have succeeded
[1] 12
names(ratios) = names(monthly_max)
chi_95level = qchisq(1-0.05/12,1)
plot(unlist(ratios),main="95% confidence test for time independance, \nBonferroni multiple Testing", xlab="Month",ylab="Likelihood Ratio Statistic")
abline(a=chi_95level,b=0,col="red")

PART 3 We will now analyse temporal clustering of extremes. For this, we will use the exiplot function from the evd library.

# define a function for getting the extremal indices for each month for a given threshold
monthly_eindex <- function(data, threshold_p, r=0){
  ei = list()
  for (i in 1:length(data)) {
    threshold = quantile(as.data.frame(data[[i]])$V1, threshold_p)
    ei[[i]]=exi(as.data.frame(data[[i]])$V1, threshold, r)
  }
  names(ei) = names(data)
  return(ei)
}
ei = monthly_eindex(get_monthly(prod_ts), 0.95)
plot(unlist(ei), main="Extremal Index by Month, 95%-Quantile as Threshold", xlab="Month", ylab="Extremal index")

We observe that the extremal index is 0.25-0.45, we can therefore conclude that we have strong dependance of extremes, with the limiting mean cluster size being roughly from 2 to 4. The clustering has no effect for estimaters based on the (monthly) maximum, but the r largest estimator is influenced by it.

PART 4 First, let’s estimate the 10 year return level using point process approach

monthly_fits_pp = list()
monthly_data = get_monthly(prod_ts)
error_cases = c(5, 9)
month_days = c(31,28,31,30,31,30,31,31,30,31,30,31)
for (i in 1:length(monthly_max)) {
  print(i)
  threshold = quantile(as.data.frame(monthly_data[[i]])$V1, 0.95)
  
  if (i %in% error_cases) {
    monthly_fits_pp[[i]] = fpot(as.data.frame(monthly_data[[i]])$V1,
                             threshold = threshold,
                             model="pp",
                             npp = month_days[i]*8,
                             cmax = TRUE,
                             r = 1,
                             std.err = FALSE,
                             method = "Nelder-Mead")
  }
  else {
    monthly_fits_pp[[i]] = fpot(as.data.frame(monthly_data[[i]])$V1,
                             threshold = threshold,
                             model="pp",
                             npp = month_days[i]*8,
                             cmax = TRUE,
                             r = 1,
                             method = "Nelder-Mead")
  }
}
[1] 1
[1] 2
[1] 3
[1] 4
[1] 5
[1] 6
[1] 7
[1] 8
[1] 9
[1] 10
[1] 11
[1] 12
names(monthly_fits_pp) = names(monthly_data)
for(i in 1:12){
  par(mfrow=c(2,2)) 
  plot(monthly_fits_pp[[i]])
}

The fit in february has completely failed, and the others are not very good either

We will still estimate the return levels:

return_level = function(x,period=20){
  if (is.list(x)) {
    loc = x$estimate[[1]]
    scale = x$estimate[[2]]
    shape = x$estimate[[3]]
  }
  if (is.vector(x)) {
    loc = x[1]
    scale = x[2]
    shape = x[3]
  }
  p = 1/period
  
  level = loc + scale*(((-log(1-p))^-shape-1)/shape)
  return(level)
}
return_level_20 = lapply(monthly_fits, return_level) # 20 for testing
return_level_100 = lapply(monthly_fits, return_level, period=100)
return_level_50 = lapply(monthly_fits, return_level, period=50)
plot(unlist(return_level_100),main="100 Year Return level, estimated with point process", xlab="Month",ylab="Return Level")

plot(unlist(return_level_50),main="50 Year Return level, estimated with point process", xlab="Month",ylab="Return Level")

TODO: estimat with mcmc Assuming that we have the posterior densities of the markov chains, call theis function to plot return level histograms

return_level_mcmc = function(posterior,period=20, plot=F){
  u = mc.quant(posterior,p=1-1/period,lh="gev")
  label_mcmc_rl = sprintf("%s Year Return Level",period)
  if(plot) hist(u,nclass=20,prob=T,xlab=label_mcmc_rl, main = "Return Level Histogram")
}

lapply(monthly_bayes_fit, function(x) return_level_mcmc(x$posterior, period=50))
lapply(monthly_bayes_fit, function(x) return_level_mcmc(x$posterior, period=100))

bayes_50yr_retlvl = lapply(bayes_params, return_level, period=50)
bayes_100yr_retlvl = lapply(bayes_params, return_level, period=100)

** Part 6** We now examine the assymptotic dependance between CAPE and SRH. This can be done by using chi-chibar plots. To eliminate the seasonality, we again consider the monthly data

# plot the chi plot for dependance analysis
data_overall_ts = xts(cbind(cape,srh), order.by = rep(dates, each=8))
data_monthly_bivariate = get_monthly(data_overall_ts)
for (i in 1:length(data_monthly_bivariate)) {
  cape.local = as.numeric(data_monthly_bivariate[[i]]$cape)
  srh.local = as.numeric(data_monthly_bivariate[[i]]$srh)
  dat.cape_srh = cbind(cape.local,srh.local);
  par(mfrow=c(1,2))
  label_chi = sprintf("Chi plot Month %s",i)
  label_chibar = sprintf("Chi Bar plot Month %s",i)
  chiplot(dat.cape_srh, main1 = label_chi, which=1);
  abline(a=0,b=0,col="red")
  chiplot(dat.cape_srh, main2 = label_chibar, which=2);
}

It seems that the confidence intervals for chi contain the 0 horizontal line, we can therefore not conclude that cape and srh are dependant.

Part 7 We will now fit different bivariate models to the monthly maxima and compare them using AIC. We will also estimate the dependance using the logistic link. (if the confidence interval for dep does not contain 1, then we can conclude that there is dependance).

# prepare the data
monthly_max.cape = get_monthly(apply.monthly(data_overall_ts$cape, max))
monthly_max.srh = get_monthly(apply.monthly(data_overall_ts$srh, max))

# fit the different models
zeros = 0*(1:12)
AIC_df = data.frame(zeros,zeros,zeros,zeros,zeros,zeros)
dep.estimate = zeros
dep.sd = zeros

bv_evd_models = c("log","alog","neglog","bilog","ct","negbilog")
colnames(AIC_df) = bv_evd_models

bv_fits = list()
for (i in 1:12) {
  print(i)
  # fit the data
  monthly_max_bivariate = cbind(as.numeric(monthly_max.cape[[i]]),
                                as.numeric(monthly_max.srh[[i]]))
  # fit1 = fbvevd(monthly_max_bivariate,model = "log")
  # fit2 = fbvevd(monthly_max_bivariate,model = "alog",std.err = FALSE)
  # fit3 = fbvevd(monthly_max_bivariate,model="neglog") 
  # fit4 = fbvevd(monthly_max_bivariate,model="bilog", std.err = FALSE) 
  # fit5 = fbvevd(monthly_max_bivariate,model="ct", std.err = FALSE) 
  # fit6 = fbvevd(monthly_max_bivariate,model="negbilog", std.err = FALSE)
  
  fit1 = fbvevd(monthly_max_bivariate,model = "log", method = "Nelder-Mead")
  fit2 = fbvevd(monthly_max_bivariate,model = "alog", method = "Nelder-Mead", std.err = FALSE)
  fit3 = fbvevd(monthly_max_bivariate,model="neglog",  method = "Nelder-Mead") 
  fit4 = fbvevd(monthly_max_bivariate,model="bilog",  method = "Nelder-Mead", std.err = FALSE) 
  fit5 = fbvevd(monthly_max_bivariate,model="ct",  method = "Nelder-Mead", std.err = FALSE) 
  fit6 = fbvevd(monthly_max_bivariate,model="negbilog", method = "Nelder-Mead", std.err = FALSE)

  bv_fits[[i]] = list(fit1, fit2, fit3, fit4, fit5, fit6)
  names(bv_fits[[i]]) = bv_evd_models
  
  # plot the fit for diagnostics
  par(mfrow=c(3,2)) 
  #plot(fit1)
  
  # keep track of results
  dep.estimate[i] = as.numeric(fit3$estimate["dep"])
  dep.sd[i] = as.numeric(fit3$std.err["dep"])
  aics = c(AIC(fit1), AIC(fit2), AIC(fit3), AIC(fit4), AIC(fit5), AIC(fit6))
  AIC_df[i,] = aics

}
names(bv_fits) = month_names

# show AICs for model selection
print(AIC_df)
# plot depencance values for depenance analysis.
par(mfrow=c(1,1))
plot_w_err(1:12, dep.estimate, dep.sd, 1.96,"Monthly Dependance Parameter vs Independance (Red)")
abline(a=0,b=0,col="red")

When plotting the fit diagnostics, the following look good: fit1, fit2, fit4, fit5, fit6

The fit could not be plotted for fit3.

We believe that there is no reason to assume that a different model should be used for different months, so we chose the logistic model. It is one of the best models in every month, and if it isn’t, the difference is only very small.

The plot of the dependance parameter shows that our analysis does coincides with what we saw for the chi-chibar plot, the two series seem to have independant extremes.

# Return levels for model with best AUC for every month
bivar_100yr_retlvl = vector()
bivar_50yr_retlvl = vector()
for (i in 1:12) {
  n_sims = 500000
  mod_ix = c("neglog"=3) #Always use neglog
  #mod_ix = which.min(AIC_df[i, ]) # Index of model with best AIC for given month
  estimate = bv_fits[[i]][[mod_ix]]$estimate
  model=names(mod_ix)
  # Pass appropriate parameters according to model
  if (model %in% c("log", "neglog")) {
    sim_vals = rbvevd(n=n_sims,
                      dep=estimate["dep"],
                      mar1=estimate[c("loc1", "scale1", "shape1")],
                      mar2=estimate[c("loc2", "scale2", "shape2")],
                      model=model)
  }
  else if (model %in% c("bilog", "ct", "negbilog")) {
    sim_vals = rbvevd(n=n_sims,
                      alpha=estimate["alpha"],
                      beta=estimate["beta"],
                      mar1=estimate[c("loc1", "scale1", "shape1")],
                      mar2=estimate[c("loc2", "scale2", "shape2")],
                      model=model)
  }
  else if (model %in% c("alog")) {
    sim_vals = rbvevd(n=n_sims,
                      dep=estimate["dep"],
                      asy=c(estimate["asy1"], estimate["asy2"]),
                      mar1=estimate[c("loc1", "scale1", "shape1")],
                      mar2=estimate[c("loc2", "scale2", "shape2")],
                      model=model)
  }
  # Obtain simulated PROD value, get return levels
  sim_prod = sqrt(sim_vals[ , 1]) * sim_vals[ , 2]
  
  # GET RETURN LEVELS WITH MEAN OF MAX, DOESN'T REALLY MAKE SENSE
  # #bivar_100yr_retlvl[i] = mean(apply(matrix(sim_prod, ncol=100), 2, max, na.rm=T),
  #                              na.rm=T)
  # #bivar_50yr_retlvl[i] = mean(apply(matrix(sim_prod, ncol=50), 2, max, na.rm=T),
  #                             na.rm=T)
  # TAKE N-TH LARGEST ORDER STATISTIC, WITH N = N_SIMS/RET_INTERVAL
  #bivar_100yr_retlvl[i] = sort(sim_prod, #partial=n_sims/100,
                               #decreasing=T)[n_sims/100]
  #bivar_50yr_retlvl[i] = sort(sim_prod, #partial=n_sims/50,
                              #decreasing=T)[n_sims/50]
  # USE QUANTILES
  bivar_100yr_retlvl[i] = quantile(sim_prod, probs=1-1/100, na.rm = T)
  bivar_50yr_retlvl[i] = quantile(sim_prod, probs =1-1/50, na.rm = T)
}
NaNs wurden erzeugtNaNs wurden erzeugtNaNs wurden erzeugtNaNs wurden erzeugtNaNs wurden erzeugtNaNs wurden erzeugtNaNs wurden erzeugtNaNs wurden erzeugt
plot(1:12, return_level_100, col="blue", ylim=c(0, 100000),
     main="100 Year Return Levels", pch=19)
points(1:12, bivar_100yr_retlvl, col="red", pch=19)
points(1:12, bayes_100yr_retlvl, col="green", pch=19)
points(1:12, mle_100yr_retlvl, col="orange", pch=19)
legend("topright", c("BAY", "POI", "BIV", "MLE"), 
       fill=c("green","blue","red", "orange"))

plot(1:12, return_level_50, col="blue", ylim=c(0, 100000),
     main="50 Year Return Levels", pch=19)
points(1:12, bivar_50yr_retlvl, col="red", pch=19)
points(1:12, bayes_50yr_retlvl, col="green", pch=19)
points(1:12, mle_50yr_retlvl, col="orange", pch=19)
legend("topright", c("BAY", "POI", "BIV", "MLE"), 
       fill=c("green","blue","red", "orange"))

---
title: "Thunderstrom Analysis"
output: html_notebook
---
**Imports**
```{r}
library(evd); 
library(evdbayes);
library(coda);
library(ismev);
library(xts);
library(ggplot2);
```


**Loading the Data**

```{r}
load("../data/CAPE_Minder_Rychener_Malsot.RData")
load("../data/NINO34.RData")
load("../data/SRH_Minder_Rychener_Malsot.RData")
```

**Generate PROD**
```{r}
prod = sqrt(cape)*srh
```



** Create Time Series Objects **
```{r}
dates <- seq.Date(as.Date("1979-1-1"), as.Date("2015-12-31"), by=1)
feb29ix <- format(as.Date(dates), "%m-%d") == "02-29"
dates <- dates[!feb29ix]

prod_ts = xts(prod, order.by = rep(dates, each=8))
```


**Plot of Time Series**
```{r}
ts_plot = autoplot(prod_ts) + xlab("Time") + ylab("PROD")
ggsave("../plots/full_time_series.pdf", plot=ts_plot, width = 6, height = 3)
ggsave("../plots/full_time_series.png", plot=ts_plot, width = 3, height = 1.5)

prod_full_monthly_max = apply.monthly(prod_ts, max)
max_ts_plot = autoplot(prod_full_monthly_max) +
  xlab("Time") + ylab("PROD") +
  geom_line(size=.5)
ggsave("../plots/monthly_max_series.pdf", plot=max_ts_plot, width = 6, height = 3)
ggsave("../plots/monthly_max_series.png", plot=max_ts_plot, width = 3, height = 1.5)

nino34_plot = ggplot() + geom_line(aes(index(prod_full_monthly_max), nino34)) +
  xlab("Time") + ylab("NINO 3.4")
ggsave("../plots/nino34_plot.pdf", plot=nino34_plot, width = 6, height = 3)
ggsave("../plots/nino34_plot.png", plot=nino34_plot, width = 3, height = 1.5)


```



**Group Monthly**
```{r}
month_names = c("jan","feb","mar","apr","may","jun","jul","aug","sep","oct","nov","dec")
get_monthly = function(x) {
  output = list()
  len = nrow(x)
  
  # Get month of element
  month = time(x)
  month = gsub("....-", "", month)
  month = gsub("-..", "", month)
  monthlist = unique(month)
  for (i in 1:12) {
    output[[i]] = x[month == monthlist[i],]
  }
  names(output) = month_names
  return(output)
}

monthly_max = get_monthly(apply.monthly(prod_ts, max))
r = 2
r_monthly_max = get_monthly(apply.monthly(prod_ts, function(x) order(x, decreasing=T)[1:r]))
```

**Plot Monthly Time Series**
```{r}
for (i in 1:12) {
  tmp = autoplot(monthly_max[[i]]) + xlab("Time") + ylab("PROD")
  ggsave(paste("../plots/monthly_max_series/", sprintf("%02d", i), "monthly_max_series.pdf", sep=""), 
         plot=tmp, width = 6, height = 3)
}
```


```{r}
# get the monthly maxima of prod
m1 = as.data.frame(apply.monthly(prod_ts, max))$V1;
# produce the gumbel qq plot
gumbel_qq = function(x, title="") qqplot(qgumbel(c(1:length(x))/(length(x)+1)),
                                         x,
                                         main = paste("Gumbell Q-Q Plot", title),
                                         xlab = "Theoretical Quantiles", 
                                         ylab = "Sample Quantiles") ;

gumbel_qq(m1)

#qqplot(qgumbel(c(1:length(m1))/(length(m1)+1)),m1,main = "Gumbell Q-Q Plot",xlab = "Theoretical Quantiles", ylab = "Sample Quantiles") ;
```
The qq plot doesn't fit very well, especially in the lower tail. This is likely due
to seasonal dependence.

**Fitting GEV to the entire data**
```{r}
# fit gevd with MLE and produce diagnostic plots
fitmax.MLE<-fgev(m1,method="Nelder-Mead")
par(mfrow=c(2,2))
fitmax.MLE
plot(fitmax.MLE)
```
Poor fit, probably because the distribution isn't stationary. This is especially 
visible in the Probability plot, in which the confidence band is surpassed, 
indicating a poor fit.


```{r}
# fit gevd with Bayesian Techniques
# use the previous outputs (rounded) as initial values (use a different shape)
init<-c(1.6e4,4e3,0.1)
# arbitrary priors
mat <- diag(c(10000,10000,100)) 
pn <- prior.norm(mean=c(0,0,0),cov=mat)
# proposal standard deviation (found by trial and error to get 20%<acceptance rate<40%)
psd<-c(500,0.03,0.02)
# draw 3k samples from markov chain
nit = 30000
thinning = 300
post<-posterior(nit, init=init,prior=pn,lh="gev",data=m1,psd=psd)
# diagnostic plots
MCMC<-mcmc(post[1:nit %% thinning == 0, ], thin=300) 
plot(MCMC) 
attr(mcmc(post),"ar")

```


```{r}
#MCMC_stab <- mcmc(post, thin=50, start=1000)
acf(mcmc(post[1:nit %% thinning == 0, ]))
```
We observe that there seem to be no substantial issues from the autocorrelation 
plots. 

```{r}
apply(mcmc(post[1:nit %% thinning == 0, ]),2,mean)
apply(mcmc(post[1:nit %% thinning == 0, ]),2,sd)

```

** Fit with MLE for months separately**
```{r}
#monthly_fits = lapply(monthly_max, 
#                      function(x) fgev(data.frame(x)[,1], method="Nelder-Mead"))
monthly_fits = list()
error_cases = c(9, 12)
for (i in 1:length(monthly_max)) {
  if (i %in% error_cases) {
    monthly_fits[[i]] = fgev(as.data.frame(monthly_max[[i]])$V1, 
                             method = "Nelder-Mead",
                             std.err = FALSE)
  }
  else {
    monthly_fits[[i]] = fgev(as.data.frame(monthly_max[[i]])$V1, 
                             method = "Nelder-Mead")
  }
}
names(monthly_fits) = names(monthly_max)
```

```{r}
for (i in 1:12) {
  pdf(paste("../plots/monthly_mle_diag/", sprintf("%02d", i), "_monthly_mle_diag.pdf", sep=""),
      width=7, height=7)
  par(mfrow=c(2,2))
  plot(monthly_fits[[i]])
  dev.off()
}
```


```{r}
gumbel_qq(data.frame(monthly_max[[9]])[,1], "September")
gumbel_qq(data.frame(monthly_max[[12]])[,1], "December")
```

**Fit R largest order statistics**
```{r}

largest_10 = get_monthly(apply.monthly(prod_ts, function(x) order(x, decreasing=T)[1:4]))
largest_2_fit = lapply(largest_10, function(x) rlarg.fit(x[ , 1:2]))

```

```{r}
get_se = function(x, ix) {
  if (is.null(x$std.err)) 0
  else x$std.err[ix]
}
mle_loc = unlist(lapply(monthly_fits, function(x) x$estimate[1]))
mle_loc_se = unlist(lapply(monthly_fits, get_se, 1))
mle_scale = unlist(lapply(monthly_fits, function(x) x$estimate[2]))
mle_scale_se = unlist(lapply(monthly_fits, get_se, 2))
mle_shape = unlist(lapply(monthly_fits, function(x) x$estimate[3]))
mle_shape_se = unlist(lapply(monthly_fits, get_se, 3))

mle_50yr_retlvl = lapply(monthly_fits, return_level, period=50)
mle_100yr_retlvl = lapply(monthly_fits, return_level, period=100)

```


```{r}
plot_w_err = function(x, y, se, se.conf_mult = 1,title = NULL) {
  upper_error = y+se.conf_mult*se
  lower_error = y-se.conf_mult*se
  
  max_y = max(upper_error)
  min_y = min(lower_error)
  
  plot(x, y,
       ylim = c(min_y, max_y),
       main = title)
  arrows(x,upper_error,x,lower_error, code=3, length=0.02, angle = 90)
}
plot_w_err(1:12, mle_loc, mle_loc_se, 1, "Location Parameter as Estimated with Likelihood")
plot_w_err(1:12, mle_scale, mle_scale_se, 1, "Scale Parameter as Estimated with Likelihood")
plot_w_err(1:12, mle_shape, mle_shape_se, 1, "Shape Parameter as Estimated with Likelihood")

```

** Fit with Bayesian Methods for months separately**
```{r}


# Fits GEV distribution with bayesian method for given parameters
bayes_fitter = function(x, 
                        init = c(1e3, 1e3, 0.1), # Initial values
                        mat = diag(c(10000,10000,100)),
                        psd = c(500,0.1,0.1), # Proposed SDev
                        nit = 3000, # Nb Iterations
                        thinning = 50, # Thinning Factor
                        do_diagn = FALSE, # Bool whether to show diagnostic plots
                        do_autoreg = FALSE, # Bool whether to show autoreg plots
                        seed = 42 # Seed 
                        ) {
  set.seed(seed)                
  pn = prior.norm(mean=c(0,0,0),cov=mat)
  post = posterior(nit, init=init, prior=pn, lh="gev", data=x, psd=psd)
  
  if(do_diagn) {
    MCMC<-mcmc(post[1:nit %% thinning == 0, ]) 
    plot(MCMC) 
  }
  if(do_autoreg) {
    acf(mcmc(post[1:nit %% thinning == 0, ]))
  }
  list(posterior = post, 
       acceptance_rate = attr(mcmc(post),"ar"))
}



# Iteratively fits GEV with bayesian methods, until the fit has 
# acceptable acceptance rates (i.e. 0.2 < AR < 0.4). If the AR is too high, 
# the proposed SDev for the parameter is multiplied with 1.5. If it's too small,
# the proposed SDev is divided by 2. This is repeated until the acceptance rate
# is good for all parameters, or max_it is reached. Then, a final model is fitted
# with more iterations. 
bayes_fitter_param_search = function(x,
                                     psd_init = c(500,0.1,0.1), # Initial proposed SDev
                                     nit_full = 3000, # Nb Iterations for final model
                                     nit_search = 150, # Nb Iterations for param search
                                     do_diagn = FALSE, # Bool whether to show diagnostic plots
                                     do_autoreg = FALSE, # Bool whether to show autoreg plots
                                     max_it = 20,
                                     ... # Additional params passed to bayes_fitter
                                     ) 
{
  # Iterate until desired acceptance rate is obtained
  cont = TRUE
  psd = psd_init
  it = 0
  while(cont) {
    it = it+1
    if (it > max_it) {
      warning("The ")
    }
    fit = bayes_fitter(x, psd=psd, nit=nit_search, do_diagn=FALSE, 
                       do_autoreg=FALSE,...)
    acc_rates = fit$acceptance_rate[1, 1:3]
    
    too_high = acc_rates > .4
    too_low = acc_rates < .2
    
    if (all(!too_high) && all(!too_low)) { # All acceptance rates lie within threshold
      cont=FALSE
    } else if (it > max_it) { # max_it is reached
      cont=FALSE
      warning("max_it was reached")
    } else { # Correct values which have wrong threshold
      psd[too_high] = psd[too_high] * 1.5
      psd[too_low] = psd[too_low] / 2
    }
  }
  
  # Fit final model
  bayes_fitter(x, psd=psd, nit=nit_full, do_diagn=do_diagn, 
               do_autoreg=do_autoreg, ...)
  
}
                                     

monthly_bayes_fit = lapply(monthly_max, bayes_fitter_param_search, do_diagn = TRUE, 
                           do_autoreg = TRUE,
                           psd = c(500,0.3,0.3), nit_full=30000, nit_search = 30000,
                           thinning = 300)
acceptance_rates = lapply(monthly_bayes_fit, function(x) x$acceptance_rate[1, ])
print(acceptance_rates)
bayes_params = lapply(monthly_bayes_fit, function(x) apply(x$posterior, 2, median))
bayes_stderr = lapply(monthly_bayes_fit, function(x) apply(x$posterior, 2, sd))


```


TODO-> R largest fit



**PART 2**
We perform 2 tests, first for linear depenance, then for step dependance (trend = int(rescaled(trend)>0))

First, we check if the location parameter depends on time using a likelihood ratio test
```{r}
#monthly_fits = lapply(monthly_max, 
#                      function(x) fgev(data.frame(x)[,1], method="Nelder-Mead"))
ratios = list()
trend = 1:length(as.data.frame(monthly_max[[12]])$V1)
trend = (trend-mean(trend))/sd(trend) # scale and center covariates as recommended
error_cases = c(9, 12)
for (i in 1:length(monthly_max)) {
  print(i)
  
  fit_const = fgev(as.data.frame(monthly_max[[i]])$V1, 
                             method = "Nelder-Mead",
                             std.err = FALSE)
  fit_dependant = fgev(as.data.frame(monthly_max[[i]])$V1, 
                             method = "Nelder-Mead",
                             nsloc = trend,
                             std.err = FALSE)
  
  ratios[[i]] = fit_const$dev-fit_dependant$dev 
}
names(ratios) = names(monthly_max)
chi_95level = qchisq(1-0.05/12,1)

plot(unlist(ratios),main="95% confidence test for time independance, \nBonferroni multiple Testing", xlab="Month",ylab="Likelihood Ratio Statistic")
abline(a=chi_95level,b=0,col="red")

```

Now, let's check for independance from ENSO
```{r}
#monthly_fits = lapply(monthly_max, 
#                      function(x) fgev(data.frame(x)[,1], method="Nelder-Mead"))
ratios = list()
# split nino data into months
n = nino34
dim(n)=c(12,length(as.data.frame(monthly_max[[12]])$V1))
error_cases = c(9, 12)
for (i in 1:length(monthly_max)) {
  print(i)
  trend = n[i,]
  
  trend = (trend-mean(trend))/sd(trend) # scale and center covariates as recommended
  fit_const = fgev(as.data.frame(monthly_max[[i]])$V1, 
                             method = "Nelder-Mead",
                             std.err = FALSE)
  fit_dependant = fgev(as.data.frame(monthly_max[[i]])$V1, 
                             method = "Nelder-Mead",
                             nsloc = trend,
                             std.err = FALSE)
  
  ratios[[i]] = fit_const$dev-fit_dependant$dev 
}
names(ratios) = names(monthly_max)
chi_95level = qchisq(1-0.05/12,1)

plot(unlist(ratios),main="95% confidence test for independance from ENSO, \nBonferroni multiple testing", xlab="Month",ylab="Likelyhood Ratio Statistic")
abline(a=chi_95level,b=0,col="red")

```

Now the same for step dependance
```{r}
#monthly_fits = lapply(monthly_max, 
#                      function(x) fgev(data.frame(x)[,1], method="Nelder-Mead"))
ratios = list()
# split nino data into months
n = nino34
dim(n)=c(12,length(as.data.frame(monthly_max[[12]])$V1))
error_cases = c(9, 12)
for (i in 1:length(monthly_max)) {
  print(i)
  trend = n[i,]
  trend = as.integer(trend>0)
  
  trend = (trend-mean(trend))/sd(trend) # scale and center covariates as recommended
  fit_const = fgev(as.data.frame(monthly_max[[i]])$V1, 
                             method = "Nelder-Mead",
                             std.err = FALSE)
  fit_dependant = fgev(as.data.frame(monthly_max[[i]])$V1, 
                             method = "Nelder-Mead",
                             nsloc = trend,
                             std.err = FALSE)
  
  ratios[[i]] = fit_const$dev-fit_dependant$dev 
}
names(ratios) = names(monthly_max)
chi_95level = qchisq(1-0.05/12,1)

plot(unlist(ratios),main="95% confidence test for independance from ENSO, \nBonferroni multiple testing", xlab="Month",ylab="Likelyhood Ratio Statistic")
abline(a=chi_95level,b=0,col="red")

```


```{r}
#monthly_fits = lapply(monthly_max, 
#                      function(x) fgev(data.frame(x)[,1], method="Nelder-Mead"))
ratios = list()
trend = 1:length(as.data.frame(monthly_max[[12]])$V1)
trend = (trend-mean(trend))/sd(trend) # scale and center covariates as recommended
trend = as.integer(trend>0)
error_cases = c(9, 12)
for (i in 1:length(monthly_max)) {
  print(i)
  
  fit_const = fgev(as.data.frame(monthly_max[[i]])$V1, 
                             method = "Nelder-Mead",
                             std.err = FALSE)
  fit_dependant = fgev(as.data.frame(monthly_max[[i]])$V1, 
                             method = "Nelder-Mead",
                             nsloc = trend,
                             std.err = FALSE)
  
  ratios[[i]] = fit_const$dev-fit_dependant$dev 
}
names(ratios) = names(monthly_max)
chi_95level = qchisq(1-0.05/12,1)

plot(unlist(ratios),main="95% confidence test for time independance, \nBonferroni multiple Testing", xlab="Month",ylab="Likelihood Ratio Statistic")
abline(a=chi_95level,b=0,col="red")



```



**PART 3**
We will now analyse temporal clustering of extremes. For this, we will use the exiplot function from the evd library.

```{r}
# define a function for getting the extremal indices for each month for a given threshold
monthly_eindex <- function(data, threshold_p, r=0){
  ei = list()
  for (i in 1:length(data)) {
    threshold = quantile(as.data.frame(data[[i]])$V1, threshold_p)
    ei[[i]]=exi(as.data.frame(data[[i]])$V1, threshold, r)
  }
  names(ei) = names(data)

  return(ei)
}

ei = monthly_eindex(get_monthly(prod_ts), 0.95)
plot(unlist(ei), main="Extremal Index by Month, 95%-Quantile as Threshold", xlab="Month", ylab="Extremal index")
```

We observe that the extremal index is ~0.25-~0.45, we can therefore conclude that we have strong dependance of extremes, with the limiting mean cluster size being roughly from 2 to 4. The clustering has no effect for estimaters based on the (monthly) maximum, but the r largest estimator is influenced by it.


**PART 4**
First, let's estimate the 10 year return level using point process approach
```{r}
monthly_fits_pp = list()
monthly_data = get_monthly(prod_ts)
error_cases = c(5, 9)
month_days = c(31,28,31,30,31,30,31,31,30,31,30,31)
for (i in 1:length(monthly_max)) {
  print(i)
  threshold = quantile(as.data.frame(monthly_data[[i]])$V1, 0.95)
  
  if (i %in% error_cases) {
    monthly_fits_pp[[i]] = fpot(as.data.frame(monthly_data[[i]])$V1,
                             threshold = threshold,
                             model="pp",
                             npp = month_days[i]*8,
                             cmax = TRUE,
                             r = 1,
                             std.err = FALSE,
                             method = "Nelder-Mead")
  }
  else {
    monthly_fits_pp[[i]] = fpot(as.data.frame(monthly_data[[i]])$V1,
                             threshold = threshold,
                             model="pp",
                             npp = month_days[i]*8,
                             cmax = TRUE,
                             r = 1,
                             method = "Nelder-Mead")
  }
}
names(monthly_fits_pp) = names(monthly_data)
for(i in 1:12){
  par(mfrow=c(2,2)) 
  plot(monthly_fits_pp[[i]])
}

```
The fit in february has completely failed, and the others are not very good either

We will still estimate the return levels:
```{r}
return_level = function(x,period=20){
  if (is.list(x)) {
    loc = x$estimate[[1]]
    scale = x$estimate[[2]]
    shape = x$estimate[[3]]
  }
  if (is.vector(x)) {
    loc = x[1]
    scale = x[2]
    shape = x[3]
  }
  p = 1/period
  
  level = loc + scale*(((-log(1-p))^-shape-1)/shape)
  return(level)
}
return_level_20 = lapply(monthly_fits, return_level) # 20 for testing
return_level_100 = lapply(monthly_fits, return_level, period=100)
return_level_50 = lapply(monthly_fits, return_level, period=50)
plot(unlist(return_level_100),main="100 Year Return level, estimated with point process", xlab="Month",ylab="Return Level")
plot(unlist(return_level_50),main="50 Year Return level, estimated with point process", xlab="Month",ylab="Return Level")

```


TODO: estimat with mcmc
Assuming that we have the posterior densities of the markov chains, call theis function to plot return level histograms

```{r}
return_level_mcmc = function(posterior,period=20, plot=F){
  u = mc.quant(posterior,p=1-1/period,lh="gev")
  label_mcmc_rl = sprintf("%s Year Return Level",period)
  if(plot) hist(u,nclass=20,prob=T,xlab=label_mcmc_rl, main = "Return Level Histogram")
}

lapply(monthly_bayes_fit, function(x) return_level_mcmc(x$posterior, period=50))
lapply(monthly_bayes_fit, function(x) return_level_mcmc(x$posterior, period=100))

bayes_50yr_retlvl = lapply(bayes_params, return_level, period=50)
bayes_100yr_retlvl = lapply(bayes_params, return_level, period=100)


```



** Part 6**
We now examine the assymptotic dependance between CAPE and SRH. This can be done by using chi-chibar plots. To eliminate the seasonality, we again consider the monthly data
```{r}
# plot the chi plot for dependance analysis
data_overall_ts = xts(cbind(cape,srh), order.by = rep(dates, each=8))
data_monthly_bivariate = get_monthly(data_overall_ts)

for (i in 1:length(data_monthly_bivariate)) {
  cape.local = as.numeric(data_monthly_bivariate[[i]]$cape)
  srh.local = as.numeric(data_monthly_bivariate[[i]]$srh)
  dat.cape_srh = cbind(cape.local,srh.local);
  par(mfrow=c(1,2))
  label_chi = sprintf("Chi plot Month %s",i)
  label_chibar = sprintf("Chi Bar plot Month %s",i)
  chiplot(dat.cape_srh, main1 = label_chi, which=1);
  abline(a=0,b=0,col="red")
  chiplot(dat.cape_srh, main2 = label_chibar, which=2);
}
```

It seems that the confidence intervals for chi contain the 0 horizontal line, we can therefore not conclude that cape and srh are dependant.



**Part 7**
We will now fit different bivariate models to the monthly maxima and compare them using AIC. We will also estimate the dependance using the logistic link. (if the confidence interval for dep does not contain 1, then we can conclude that there is dependance).

```{r}
# prepare the data
monthly_max.cape = get_monthly(apply.monthly(data_overall_ts$cape, max))
monthly_max.srh = get_monthly(apply.monthly(data_overall_ts$srh, max))

# fit the different models
zeros = 0*(1:12)
AIC_df = data.frame(zeros,zeros,zeros,zeros,zeros,zeros)
dep.estimate = zeros
dep.sd = zeros

bv_evd_models = c("log","alog","neglog","bilog","ct","negbilog")
colnames(AIC_df) = bv_evd_models

bv_fits = list()
for (i in 1:12) {
  print(i)
  # fit the data
  monthly_max_bivariate = cbind(as.numeric(monthly_max.cape[[i]]),
                                as.numeric(monthly_max.srh[[i]]))
  # fit1 = fbvevd(monthly_max_bivariate,model = "log")
  # fit2 = fbvevd(monthly_max_bivariate,model = "alog",std.err = FALSE)
  # fit3 = fbvevd(monthly_max_bivariate,model="neglog") 
  # fit4 = fbvevd(monthly_max_bivariate,model="bilog", std.err = FALSE) 
  # fit5 = fbvevd(monthly_max_bivariate,model="ct", std.err = FALSE) 
  # fit6 = fbvevd(monthly_max_bivariate,model="negbilog", std.err = FALSE)
  
  fit1 = fbvevd(monthly_max_bivariate,model = "log", method = "Nelder-Mead")
  fit2 = fbvevd(monthly_max_bivariate,model = "alog", method = "Nelder-Mead", std.err = FALSE)
  fit3 = fbvevd(monthly_max_bivariate,model="neglog",  method = "Nelder-Mead") 
  fit4 = fbvevd(monthly_max_bivariate,model="bilog",  method = "Nelder-Mead", std.err = FALSE) 
  fit5 = fbvevd(monthly_max_bivariate,model="ct",  method = "Nelder-Mead", std.err = FALSE) 
  fit6 = fbvevd(monthly_max_bivariate,model="negbilog", method = "Nelder-Mead", std.err = FALSE)

  bv_fits[[i]] = list(fit1, fit2, fit3, fit4, fit5, fit6)
  names(bv_fits[[i]]) = bv_evd_models
  
  # plot the fit for diagnostics
  par(mfrow=c(3,2)) 
  #plot(fit1)
  
  # keep track of results
  dep.estimate[i] = as.numeric(fit3$estimate["dep"])
  dep.sd[i] = as.numeric(fit3$std.err["dep"])
  aics = c(AIC(fit1), AIC(fit2), AIC(fit3), AIC(fit4), AIC(fit5), AIC(fit6))
  AIC_df[i,] = aics

}
names(bv_fits) = month_names

# show AICs for model selection
print(AIC_df)

```

```{r}
# plot depencance values for depenance analysis.
par(mfrow=c(1,1))
plot_w_err(1:12, dep.estimate, dep.sd, 1.96,"Monthly Dependance Parameter vs Independance (Red)")
abline(a=0,b=0,col="red")

```



When plotting the fit diagnostics, the following look good:
fit1, fit2, fit4, fit5, fit6

The fit could not be plotted for fit3.

We believe that there is no reason to assume that a different model should be used for different months, so we chose the logistic model. It is one of the best models in every month, and if it isn't, the difference is only very small.

The plot of the dependance parameter shows that our analysis does coincides with what we saw for the chi-chibar plot, the two series seem to have independant extremes.

```{r}
# Return levels for model with best AUC for every month
bivar_100yr_retlvl = vector()
bivar_50yr_retlvl = vector()

for (i in 1:12) {
  n_sims = 500000
  mod_ix = c("neglog"=3) #Always use neglog
  #mod_ix = which.min(AIC_df[i, ]) # Index of model with best AIC for given month
  estimate = bv_fits[[i]][[mod_ix]]$estimate
  model=names(mod_ix)
  # Pass appropriate parameters according to model
  if (model %in% c("log", "neglog")) {
    sim_vals = rbvevd(n=n_sims,
                      dep=estimate["dep"],
                      mar1=estimate[c("loc1", "scale1", "shape1")],
                      mar2=estimate[c("loc2", "scale2", "shape2")],
                      model=model)

  }
  else if (model %in% c("bilog", "ct", "negbilog")) {
    sim_vals = rbvevd(n=n_sims,
                      alpha=estimate["alpha"],
                      beta=estimate["beta"],
                      mar1=estimate[c("loc1", "scale1", "shape1")],
                      mar2=estimate[c("loc2", "scale2", "shape2")],
                      model=model)

  }
  else if (model %in% c("alog")) {
    sim_vals = rbvevd(n=n_sims,
                      dep=estimate["dep"],
                      asy=c(estimate["asy1"], estimate["asy2"]),
                      mar1=estimate[c("loc1", "scale1", "shape1")],
                      mar2=estimate[c("loc2", "scale2", "shape2")],
                      model=model)
  }

  # Obtain simulated PROD value, get return levels
  sim_prod = sqrt(sim_vals[ , 1]) * sim_vals[ , 2]
  
  # GET RETURN LEVELS WITH MEAN OF MAX, DOESN'T REALLY MAKE SENSE
  # #bivar_100yr_retlvl[i] = mean(apply(matrix(sim_prod, ncol=100), 2, max, na.rm=T),
  #                              na.rm=T)
  # #bivar_50yr_retlvl[i] = mean(apply(matrix(sim_prod, ncol=50), 2, max, na.rm=T),
  #                             na.rm=T)

  # TAKE N-TH LARGEST ORDER STATISTIC, WITH N = N_SIMS/RET_INTERVAL
  #bivar_100yr_retlvl[i] = sort(sim_prod, #partial=n_sims/100,
                               #decreasing=T)[n_sims/100]
  #bivar_50yr_retlvl[i] = sort(sim_prod, #partial=n_sims/50,
                              #decreasing=T)[n_sims/50]

  # USE QUANTILES
  bivar_100yr_retlvl[i] = quantile(sim_prod, probs=1-1/100, na.rm = T)
  bivar_50yr_retlvl[i] = quantile(sim_prod, probs =1-1/50, na.rm = T)

}
```

```{r}
plot(1:12, return_level_100, col="blue", ylim=c(0, 100000),
     main="100 Year Return Levels", pch=19)
points(1:12, bivar_100yr_retlvl, col="red", pch=19)
points(1:12, bayes_100yr_retlvl, col="green", pch=19)
legend("topright", c("BAY", "POI", "BIV"), 
       fill=c("green","blue","red"))

plot(1:12, return_level_50, col="blue", ylim=c(0, 100000),
     main="50 Year Return Levels", pch=19)
points(1:12, bivar_50yr_retlvl, col="red", pch=19)
points(1:12, bayes_50yr_retlvl, col="green", pch=19)
legend("topright", c("BAY", "POI", "BIV"), 
       fill=c("green","blue","red"))


```


